import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request  # 一个单独的request模块，需要跟进的url时，能够用到它
import re
import json

from spider_lianjia.items import newHouseItem


class LianjiaSpider(scrapy.Spider):
    """Spider that scrapes new-home listings from cd.fang.lianjia.com.

    It requests the paginated listing index (``.../loupan/pg<N>/``) and
    yields one ``newHouseItem`` per listing found on each page.
    """

    name = 'lianjia'
    allowed_domains = ['cd.fang.lianjia.com']
    # Base URL of the listing index. Kept as a plain string (not the usual
    # Scrapy list) because start_requests() appends the page suffix to it.
    start_urls = 'https://cd.fang.lianjia.com/loupan/'
    cnt = 1
    # Number of index pages requested by start_requests(). Override it
    # (e.g. ``-a max_pages=1``) for a short test crawl.
    max_pages = 100

    # Exact class string of a listing <li> element in the index markup.
    _LISTING_CLASS = 'resblock-list post_ulog_exposure_scroll has-results'

    def start_requests(self):
        """Yield one request per index page, for pages 1..max_pages.

        No callback is passed, so the responses are handled by parse().
        """
        # Spider arguments arrive as strings, hence the int() conversion.
        for page in range(1, int(self.max_pages) + 1):
            yield Request('{0}pg{1}/'.format(self.start_urls, page))

    def parse(self, response):
        """Extract every listing on one index page.

        Args:
            response: The downloaded index page.

        Yields:
            newHouseItem: A separate item per listing with the fields
            name, type, sale_state, location, area, tag, avr_price and
            link filled in.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        container = soup.find('div', class_='resblock-list-container clearfix')
        wrapper = None
        if container is not None:
            wrapper = container.find('ul', class_='resblock-list-wrapper')
        if wrapper is None:
            # A page without the result list would otherwise fail with an
            # AttributeError on the chained .find() calls.
            self.logger.warning('No listing container found on %s', response.url)
            return

        for listing in wrapper.find_all('li', class_=self._LISTING_CLASS):
            yield self._build_item(listing)

    def _build_item(self, listing):
        """Build a new item from a single listing ``<li>`` element."""
        name_block = listing.find('div', class_='resblock-name')
        raw_area = listing.find('div', class_='resblock-area').find('span').get_text()
        tag_spans = listing.find('div', class_='resblock-tag').find_all('span')
        price_node = listing.find('div', class_='main-price').find('span', class_='number')

        # Create a new item for every listing: reusing one instance makes
        # all yielded items alias the same object, so later listings
        # overwrite earlier ones that are still held downstream.
        item = newHouseItem()
        item['name'] = name_block.find('a', class_='name').get_text()
        item['type'] = name_block.find('span', class_='resblock-type').get_text()
        item['sale_state'] = listing.find('span', class_='sale-status').get_text()
        item['location'] = listing.find('div', class_='resblock-location').find('span').get_text()
        item['area'] = self._normalize_area(raw_area)
        item['tag'] = '|'.join(span.get_text() for span in tag_spans)
        item['avr_price'] = price_node.get_text()
        # The href is appended to the site root to form the detail-page URL.
        item['link'] = 'https://cd.fang.lianjia.com' + name_block.find('a').get('href')
        return item

    @staticmethod
    def _normalize_area(raw):
        """Reduce the raw area text to a number or a ``low-high`` range.

        For example ``'建面 32㎡'`` becomes ``'32'`` and ``'建面 32-50㎡'``
        becomes ``'32-50'``. Text without any digits (including the empty
        string) is returned with only the unit and label removed.
        """
        area = raw.replace('㎡', '').replace('建面 ', '')
        bounds = area.split('-')
        if len(bounds) == 1:
            # Single value: keep the first run of digits when there is one.
            match = re.search(r'\d+', bounds[0])
            return match.group(0) if match else area
        if len(bounds) == 2:
            return '{0}-{1}'.format(bounds[0], bounds[1])
        return area

